package com.jtw.common.util;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls the 2018 administrative-division pages under {@link #BASE_URL}
 * (provinces, cities, counties, towns/streets and villages) and appends one
 * SQL INSERT statement per division to local {@code .sql} files.
 *
 * <p>The crawler is single-threaded and keeps its running row counters in
 * public static fields, so it is not safe to run two crawls concurrently.
 *
 * @author cjsky666
 * @date 2019/5/28 13:21
 */
public class Spider_ChinaCity {

    /**
     * One entry per province, in the form {@code <province page URL>-<6-digit province code>}.
     * {@link #main(String[])} splits an entry on {@code "-"} to obtain the start URL and
     * the parent code passed to {@link #getAllCity(String, String)}.
     */
    public static String [] paths={
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html-110000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/12.html-120000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/13.html-130000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/14.html-140000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/15.html-150000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/21.html-210000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/22.html-220000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/23.html-230000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/31.html-310000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/32.html-320000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/33.html-330000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/34.html-340000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/35.html-350000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/36.html-360000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/37.html-370000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/41.html-410000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/42.html-420000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/43.html-430000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/44.html-440000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/45.html-450000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/46.html-460000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/50.html-500000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/51.html-510000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/52.html-520000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/53.html-530000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/54.html-540000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/61.html-610000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/62.html-620000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/63.html-630000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/64.html-640000",
        "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/65.html-650000"
    };


    /**
     * Appends text to a local file, creating the file if it does not exist.
     *
     * <p>I/O failures are printed and swallowed so that a single failed write does not
     * abort a long-running crawl. The text is encoded with the platform default charset
     * (that is what {@link FileWriter} uses).
     *
     * @param str      text to append; ignored (with a log line) when {@code null}
     * @param fileName path of the target file; ignored (with a log line) when {@code null}
     */
    public static void writeFile(String str,String fileName) {
        if (str == null || fileName == null) {
            System.out.println("writeFile: 参数为空, 已忽略");
            return;
        }
        // try-with-resources closes the writer on every path and cannot hit the
        // NullPointerException the old finally-block close() raised when opening failed.
        try (FileWriter writer = new FileWriter(fileName, true)) {
            writer.write(str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /*
     * Column layout of every generated INSERT statement (administrative-division row):
     *  `ID`          auto-increment id, int
     *  `CODE`        division code, string
     *  `FULL_CODE`   statistical division code in the National Bureau of Statistics format, string
     *  `PARENT`      code of the parent division, string
     *  `NAME`        short name, string
     *  `FULL_NAME`   full name, string
     *  `PIN_YIN`     pinyin, string
     *  `TITLE_CASE`  pinyin initials, string
     *  `LNG`         longitude, string
     *  `LAT`         latitude, string
     *  `SORT`        sort order, smallint
     *  `CREATE_TIME` creation time, date
     *  `UPDATE_TIME` update time, date
     *  `DESCRIPT`    remark / description, string
     *  `STATE`       state, smallint
     */

    public static String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
    public static String PROVINCE_TABLE_NAME = "`t_xzqh_province`";
    public static String CITY_TABLE_NAME = "`t_xzqh_city`";
    public static String COUNTRY_TABLE_NAME = "`t_xzqh_country`";
    public static String TWON_TABLE_NAME = "`t_xzqh_town`";
    public static String VILLAGE_TABLE_NAME = "`t_xzqh_village`";
    public static int FULL_CODE_LENGTH = 12;
    public static int PROVINCE_CODE_LENGTH = 6;
    public static int CITY_CODE_LENGTH = 6;
    public static int AREA_CODE_LENGTH = 6;
    public static int STREET_CODE_LENGTH = 6;
    public static String PROVINCE_SQL = "INSERT INTO "+PROVINCE_TABLE_NAME+" VALUES";
    public static String CITY_SQL = "INSERT INTO "+CITY_TABLE_NAME+" VALUES";
    public static String COUNTRY_SQL = "INSERT INTO "+COUNTRY_TABLE_NAME+" VALUES";
    public static String TOWN_SQL = "INSERT INTO "+TWON_TABLE_NAME+" VALUES";
    public static String VILLAGE_SQL = "INSERT INTO "+VILLAGE_TABLE_NAME+" VALUES";

    /*
     * Counter values recorded per province. The four fields below currently hold the
     * Xinjiang row, which matches the paths[30] entry crawled by main().
     * NOTE(review): presumably each row is the value to set before crawling that
     * province - confirm before reusing.
     *
     * Beijing        cityindex = 1   countryindex = 16   townindex=336   villageindex=7154
     * Tianjin        cityindex = 2   countryindex = 17   townindex=337   villageindex=7155
     * Hebei          cityindex = 3   countryindex = 33   townindex=643   villageindex=12734
     * Shanxi         cityindex = 14  countryindex = 235  townindex=2998  villageindex=66197
     * Inner Mongolia cityindex = 25  countryindex = 368  townindex=4483  villageindex=95880
     * Liaoning       cityindex = 37  countryindex = 486  townindex=5755  villageindex=110472
     * Jilin          cityindex = 51  countryindex = 600  townindex=7341  villageindex=126823
     * Heilongjiang   cityindex = 60  countryindex = 677  townindex=8386  villageindex=138470
     *
     * Shanghai       cityindex = 73  countryindex = 823  townindex=10376 villageindex=152585
     * Jiangsu        cityindex = 74  countryindex = 839  townindex=10610 villageindex=158598
     * Zhejiang       cityindex = 87  countryindex = 959  townindex=12159 villageindex=180910
     * Anhui          cityindex = 98  countryindex = 1060 townindex=13555 villageindex=211536
     * Fujian         cityindex = 114 countryindex = 1196 townindex=15200 villageindex=229801
     * Jiangxi        cityindex = 123 countryindex = 1290 townindex=16375 villageindex=246973
     * Shandong       cityindex = 134 countryindex = 1401 townindex=18158 villageindex=268424
     * Henan          cityindex = 151 countryindex = 1574 townindex=20017 villageindex=346400
     * Hubei          cityindex = 169 countryindex = 1773 townindex=22584 villageindex=398454
     * Hunan          cityindex = 183 countryindex = 1890 townindex=24072 villageindex=427321
     * Guangdong      cityindex = 197 countryindex = 2042 townindex=26078 villageindex=456712
     * Guangxi        cityindex = 218 countryindex = 2183 townindex=27795 villageindex=482344
     * Hainan         cityindex = 232 countryindex = 2308 townindex=29086 villageindex=498801
     * Chongqing      cityindex = 237 countryindex = 2336 townindex=29312 villageindex=501794
     * Sichuan        cityindex = 239 countryindex = 2374 townindex=30345 villageindex=512998
     * Guizhou        cityindex = 260 countryindex = 2578 townindex=34987 villageindex=566576
     * Yunnan         cityindex = 269 countryindex = 2671 townindex=36469 villageindex=584123
     * Tibet          cityindex = 285 countryindex = 2808 townindex=37911 villageindex=598574
     * Shaanxi        cityindex = 292 countryindex = 2888 townindex=38612 villageindex=604053
     * Gansu          cityindex = 302 countryindex = 3005 townindex=39946 villageindex=624568
     * Qinghai        cityindex = 316 countryindex = 3104 townindex=41378 villageindex=642215
     * Ningxia        cityindex = 324 countryindex = 3150 townindex=41808 villageindex=646922
     * Xinjiang       cityindex = 329 countryindex = 3178 townindex=42071 villageindex=649796
     */

    public static int cityindex=329;
    public static int countryindex=3178;
    public static int townindex=42071;
    public static int villageindex=649796;

    // Output files; the leading "/" makes these paths relative to the filesystem root.
    public static String PROVINCE_SQL_TXT = "/province.sql";
    public static String CITY_SQL_TXT = "/city.sql";
    public static String COUNTRY_SQL_TXT = "/country.sql";
    public static String TOWN_SQL_TXT = "/town.sql";
    public static String VILLAGE_SQL_TXT = "/vallage.sql";


    /** Read timeout for a page fetch, in milliseconds. */
    public static final int  timeout= 12000;
    /** Pause before each child-page fetch, in milliseconds (crawl rate limit). */
    public static final int speed = 500;

    /** Number of leading characters of a full code that form the city code. */
    private static final int CITY_CODE_PREFIX = 6;
    /** Number of leading characters of a full code that form the county code. */
    private static final int COUNTY_CODE_PREFIX = 6;
    /** Number of leading characters of a full code that form the town code. */
    private static final int TOWN_CODE_PREFIX = 9;
    /** Maximum number of times a single page is fetched before it is given up. */
    private static final int MAX_FETCH_ATTEMPTS = 10;
    /** Wait between two fetch attempts of the same page, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /** The connection most recently created successfully by {@link #create(String)}. */
    public static HttpURLConnection connection = null;

    /**
     * Creates (but does not yet open) a GET connection to the given address with caching
     * disabled, a {@code Connection: close} header, a 5 s connect timeout and a
     * {@link #timeout} read timeout.
     *
     * <p>The connection is no longer disconnected before it is handed to the caller, and a
     * failure no longer returns the connection left over from a previous call.
     *
     * @param path absolute HTTP URL to connect to
     * @return the configured connection (also stored in {@link #connection}), or
     *         {@code null} when the URL is malformed, is not an HTTP URL, or the
     *         connection object cannot be created
     */
    public static HttpURLConnection create(String path){
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(path).openConnection();
            // GET is already the default; it is set explicitly for clarity.
            conn.setRequestMethod("GET");
            conn.setUseCaches(Boolean.FALSE);
            conn.addRequestProperty("Connection", "close");
            // Connect timeout in milliseconds.
            conn.setConnectTimeout(5000);
            // Read timeout in milliseconds.
            conn.setReadTimeout(timeout);
            connection = conn;
            return conn;
        } catch (IOException | ClassCastException e) {
            // ClassCastException: the URL was valid but not http/https.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads and parses one page as GBK, retrying on I/O failure.
     *
     * <p>Only the download is retried, at most {@link #MAX_FETCH_ATTEMPTS} times with a
     * {@link #RETRY_DELAY_MILLIS} pause in between. Row processing happens outside this
     * method, so a retry can never re-crawl child pages or write duplicate rows.
     *
     * @param url absolute URL of the page
     * @return the parsed document, or {@code null} when the page could not be fetched
     */
    private static Document fetchDocument(String url) {
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            HttpURLConnection conn = create(url);
            if (conn == null) {
                // A connection that cannot even be created will not succeed on retry.
                System.out.println("无法创建连接, 已跳过: " + url);
                return null;
            }
            try (InputStream in = conn.getInputStream()) {
                return Jsoup.parse(in, "GBK", url);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                conn.disconnect();
            }
            if (attempt == MAX_FETCH_ATTEMPTS) {
                break;
            }
            System.out.println("当前链接超时, 间隔一秒之后重新尝试");
            if (!pause(RETRY_DELAY_MILLIS)) {
                return null;
            }
        }
        System.out.println("重试" + MAX_FETCH_ATTEMPTS + "次后仍然失败, 已放弃: " + url);
        return null;
    }

    /**
     * Sleeps for the given time.
     *
     * @param millis time to sleep in milliseconds
     * @return {@code false} when the thread was interrupted (the interrupt flag is restored)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return false;
        }
    }

    /** Returns the anchors of a table row, or its cells when the row contains no anchor. */
    private static Elements cellsOf(Element row) {
        Elements cells = row.select("a");
        if (cells == null || cells.size() == 0) {
            cells = row.select("td");
        }
        return cells;
    }

    /** Returns the text of the cell at {@code index}, or an empty string when there is none. */
    private static String cellText(Elements cells, int index) {
        return index < cells.size() ? cells.get(index).text() : "";
    }

    /**
     * Makes a value safe inside a single-quoted SQL string literal by doubling single
     * quotes. Backslashes are left untouched.
     */
    private static String escapeSql(String value) {
        return value == null ? "" : value.replace("'", "''");
    }

    /**
     * Builds one INSERT statement in the column layout shared by all five tables.
     * Longitude, latitude and description are written as empty strings, the sort column
     * repeats the id, both timestamps are the current time and the state is 0.
     */
    private static String buildInsert(String insertPrefix, int index, String code,
                                      String fullCode, String parent, String name) {
        String now = String.valueOf(DateUtil.format(new Date(), DateUtil.YMD24H_DATA));
        String safeName = escapeSql(name);
        return insertPrefix + "("
                + index + ",'"
                + escapeSql(code) + "','"
                + escapeSql(fullCode) + "','"
                + escapeSql(parent) + "','"
                + safeName + "','"
                + safeName + "','"
                + escapeSql(String.valueOf(PinYinUtil.ToPinyinFirstCharUpperCase(name))) + "','"
                + escapeSql(String.valueOf(PinYinUtil.ToFirstCharUpperCase(name))) + "','"
                + "','"
                + "','"
                + index + "','"
                + now + "','"
                + now + "','"
                + "',"
                + 0 + ");\r";
    }

    /**
     * Builds the town-page URL from a link found on a county page, e.g.
     * {@code "01/650102.html"} becomes {@code BASE_URL + "65/01/650102.html"}.
     *
     * @return the absolute URL, or {@code null} when the link does not have that shape
     */
    private static String townPageUrl(String link) {
        String[] parts = link.split("/");
        if (parts.length < 2 || parts[1].length() < 2) {
            return null;
        }
        return BASE_URL + parts[1].substring(0, 2) + "/" + link;
    }

    /**
     * Builds the village-page URL from a link found on a town page, e.g.
     * {@code "02/650102001.html"} becomes {@code BASE_URL + "65/01/02/650102001.html"}.
     *
     * @return the absolute URL, or {@code null} when the link does not have that shape
     */
    private static String villagePageUrl(String link) {
        String[] parts = link.split("/");
        if (parts.length < 2 || parts[1].length() < 4) {
            return null;
        }
        String file = parts[1];
        return BASE_URL + file.substring(0, 2) + "/" + file.substring(2, 4) + "/" + link;
    }

    /**
     * Crawls one village page and appends its rows to {@link #VILLAGE_SQL_TXT}.
     * The first cell of a row is the full code and the third cell is the name.
     *
     * @param url    absolute URL of the village page
     * @param parent code of the town the villages belong to
     */
    private static void getAllVillage(String url, String parent) {
        System.out.println("爬取村镇"+url);
        Document doc = fetchDocument(url);
        if (doc == null) {
            return;
        }
        StringBuilder statements = new StringBuilder();
        for (Element row : doc.select(".villagetr")) {
            Elements cells = cellsOf(row);
            String fullCode = cellText(cells, 0);
            if (fullCode.isEmpty()) {
                System.out.println("跳过无法解析的行: " + row.text());
                continue;
            }
            // Villages are the leaf level: the code column holds the full code.
            String sql = buildInsert(VILLAGE_SQL, villageindex, fullCode, fullCode,
                    parent, cellText(cells, 2));
            System.out.println(sql);
            statements.append(sql);
            villageindex++;
        }
        writeFile(statements.toString(), VILLAGE_SQL_TXT);
    }

    /**
     * Crawls one town page, descends into every linked village page and appends the town
     * rows to {@link #TOWN_SQL_TXT}.
     *
     * @param url    absolute URL of the town page
     * @param parent code of the county the towns belong to
     */
    private static void getAllTown(String url, String parent) {
        System.out.println("爬取街道"+url);
        Document doc = fetchDocument(url);
        if (doc == null) {
            return;
        }
        StringBuilder statements = new StringBuilder();
        for (Element row : doc.select(".towntr")) {
            Elements cells = cellsOf(row);
            String fullCode = cellText(cells, 0);
            if (fullCode.length() < TOWN_CODE_PREFIX) {
                System.out.println("跳过无法解析的行: " + row.text());
                continue;
            }
            String code = fullCode.substring(0, TOWN_CODE_PREFIX);
            String link = cells.get(0).attr("href");
            if (StringUtils.isNotBlank(link)) {
                String childUrl = villagePageUrl(link);
                if (childUrl == null) {
                    System.out.println("跳过无法解析的链接: " + link);
                } else if (pause(speed)) {
                    getAllVillage(childUrl, code);
                }
            }
            String sql = buildInsert(TOWN_SQL, townindex, code, fullCode,
                    parent, cellText(cells, 1));
            System.out.println(sql);
            statements.append(sql);
            townindex++;
        }
        writeFile(statements.toString(), TOWN_SQL_TXT);
    }

    /**
     * Crawls one county page, descends into every linked town page and appends the county
     * rows to {@link #COUNTRY_SQL_TXT}.
     *
     * @param url    absolute URL of the county page
     * @param parent code of the city the counties belong to
     */
    public static void getAllCountry(String url,String parent){
        System.out.println("爬取地区"+url);
        Document doc = fetchDocument(url);
        if (doc == null) {
            return;
        }
        StringBuilder statements = new StringBuilder();
        for (Element row : doc.select(".countytr")) {
            Elements cells = cellsOf(row);
            String fullCode = cellText(cells, 0);
            if (fullCode.length() < COUNTY_CODE_PREFIX) {
                System.out.println("跳过无法解析的行: " + row.text());
                continue;
            }
            String code = fullCode.substring(0, COUNTY_CODE_PREFIX);
            String link = cells.get(0).attr("href");
            if (StringUtils.isNotBlank(link)) {
                String childUrl = townPageUrl(link);
                if (childUrl == null) {
                    System.out.println("跳过无法解析的链接: " + link);
                } else if (pause(speed)) {
                    getAllTown(childUrl, code);
                }
            }
            String sql = buildInsert(COUNTRY_SQL, countryindex, code, fullCode,
                    parent, cellText(cells, 1));
            System.out.println(sql);
            statements.append(sql);
            countryindex++;
        }
        writeFile(statements.toString(), COUNTRY_SQL_TXT);
    }

    /**
     * Crawls one province page, descends into every linked county page and appends the
     * city rows to {@link #CITY_SQL_TXT}.
     *
     * @param url    absolute URL of the province page
     * @param parent code of the province the cities belong to
     */
    public static void getAllCity(String url,String parent){
        System.out.println("爬取城市"+url);
        Document doc = fetchDocument(url);
        if (doc == null) {
            return;
        }
        StringBuilder statements = new StringBuilder();
        for (Element row : doc.select(".citytr")) {
            // Same anchor-then-cell fallback as the county and town levels.
            Elements cells = cellsOf(row);
            String fullCode = cellText(cells, 0);
            if (fullCode.length() < CITY_CODE_PREFIX) {
                System.out.println("跳过无法解析的行: " + row.text());
                continue;
            }
            String code = fullCode.substring(0, CITY_CODE_PREFIX);
            String link = cells.get(0).attr("href");
            if (StringUtils.isNotBlank(link) && pause(speed)) {
                getAllCountry(BASE_URL + link, code);
            }
            String sql = buildInsert(CITY_SQL, cityindex, code, fullCode,
                    parent, cellText(cells, 1));
            System.out.println(sql);
            statements.append(sql);
            cityindex++;
        }
        writeFile(statements.toString(), CITY_SQL_TXT);
    }

    /**
     * Crawls the index page and appends one row per province to {@link #PROVINCE_SQL_TXT}.
     * The province code is the link's file name without extension, zero-padded to
     * {@link #PROVINCE_CODE_LENGTH}; the full code is padded to {@link #FULL_CODE_LENGTH}.
     * Rows are numbered from 1 and written once, after all rows have been collected.
     *
     * @param url absolute URL of the index page
     */
    private static void getAllProvince(String url){
        System.out.println("爬取省份"+url);
        Document doc = fetchDocument(url);
        if (doc == null) {
            return;
        }
        StringBuilder statements = new StringBuilder();
        int index = 1;
        for (Element row : doc.select(".provincetr")) {
            for (Element anchor : row.select("a")) {
                String link = anchor.attr("href");
                String[] parts = link.split("\\.");
                if (StringUtils.isBlank(link) || parts.length == 0) {
                    System.out.println("跳过无法解析的链接: " + link);
                    continue;
                }
                String code = parts[0];
                code += getFillCode(PROVINCE_CODE_LENGTH - code.length());
                System.out.println(BASE_URL+link+"-------"+code);
                String fullCode = code + getFillCode(FULL_CODE_LENGTH - code.length());
                statements.append(buildInsert(PROVINCE_SQL, index, code, fullCode,
                        "", anchor.text()));
                index++;
            }
        }
        // Written once after the loops: writing inside the row loop appended the
        // accumulated earlier rows again for every further table row.
        writeFile(statements.toString(), PROVINCE_SQL_TXT);
    }

    /**
     * Returns a string of {@code '0'} characters used to pad a code.
     *
     * @param Length number of zeros wanted
     * @return {@code Length} zeros, or an empty string when {@code Length} is
     *         {@code null}, zero or negative
     */
    public static String getFillCode(Integer Length){
        if (Length == null || Length <= 0) {
            return "";
        }
        StringBuilder zeros = new StringBuilder(Length);
        for (int i = 0; i < Length; i++) {
            zeros.append('0');
        }
        return zeros.toString();
    }

    /**
     * Crawls one province (entry 30 of {@link #paths}) starting at the city level and
     * prints the elapsed time in milliseconds.
     */
    public static void main(String[] args) {
        int ab = 30;
        long start = System.currentTimeMillis();
        System.out.println(start);
        String[] entry = paths[ab].split("-");
        getAllCity(entry[0], entry[1]);
        long end = System.currentTimeMillis();
        System.out.println("总耗时"+(end-start));
    }
}
